# load packages
library(tidyverse)
library(corrr)
library(plotly)
  1. Read in the gapminder_clean.csv data as a tibble using read_csv.
# load in gapminder dataset
gapminder <- read_csv("gapminder_clean.csv") %>% column_to_rownames(., var = 'X1')
## Warning: Missing column names filled in: 'X1' [1]
head(gapminder)
##   Country Name Year Agriculture, value added (% of GDP)
## 0  Afghanistan 1962                                  NA
## 1  Afghanistan 1967                                  NA
## 2  Afghanistan 1972                                  NA
## 3  Afghanistan 1977                                  NA
## 4  Afghanistan 1982                                  NA
## 5  Afghanistan 1987                                  NA
##   CO2 emissions (metric tons per capita)
## 0                             0.07378134
## 1                             0.12378238
## 2                             0.13082014
## 3                             0.18311831
## 4                             0.16587912
## 5                             0.27556031
##   Domestic credit provided by financial sector (% of GDP)
## 0                                               21.276422
## 1                                                9.917662
## 2                                               18.880833
## 3                                               13.836822
## 4                                                      NA
## 5                                                      NA
##   Electric power consumption (kWh per capita)
## 0                                          NA
## 1                                          NA
## 2                                          NA
## 3                                          NA
## 4                                          NA
## 5                                          NA
##   Energy use (kg of oil equivalent per capita)
## 0                                           NA
## 1                                           NA
## 2                                           NA
## 3                                           NA
## 4                                           NA
## 5                                           NA
##   Exports of goods and services (% of GDP)
## 0                                 4.878051
## 1                                 6.772908
## 2                                14.763231
## 3                                11.662904
## 4                                       NA
## 5                                       NA
##   Fertility rate, total (births per woman) GDP growth (annual %)
## 0                                    7.450                    NA
## 1                                    7.450                    NA
## 2                                    7.450                    NA
## 3                                    7.449                    NA
## 4                                    7.450                    NA
## 5                                    7.461                    NA
##   Imports of goods and services (% of GDP) Industry, value added (% of GDP)
## 0                                 9.349593                               NA
## 1                                14.209827                               NA
## 2                                18.105850                               NA
## 3                                14.823175                               NA
## 4                                       NA                               NA
## 5                                       NA                               NA
##   Inflation, GDP deflator (annual %) Life expectancy at birth, total (years)
## 0                                 NA                                33.21990
## 1                                 NA                                35.38941
## 2                                 NA                                37.61015
## 3                                 NA                                40.11015
## 4                                 NA                                43.23073
## 5                                 NA                                47.29634
##   Population density (people per sq. km of land area)
## 0                                            14.31206
## 1                                            15.88181
## 2                                            17.94703
## 3                                            19.99893
## 4                                            19.40232
## 5                                            17.36656
##   Services, etc., value added (% of GDP)      pop continent gdpPercap
## 0                                     NA 10267083      Asia  853.1007
## 1                                     NA 11537966      Asia  836.1971
## 2                                     NA 13079460      Asia  739.9811
## 3                                     NA 14880372      Asia  786.1134
## 4                                     NA 12881816      Asia  978.0114
## 5                                     NA 13867957      Asia  852.3959
  1. Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing 'CO2 emissions (metric tons per capita)' and gdpPercap for the filtered data.
# filter only Year 1962 
gapminder_1962 <- gapminder %>% filter(Year == "1962")

# plot gdp v CO2
gapminder_1962 %>% ggplot(aes(gdpPercap,`CO2 emissions (metric tons per capita)`)) + geom_point() +  labs(title = "CO2 emissions by gdpPercap,continent, population size")
## Warning: Removed 151 rows containing missing values (geom_point).

  1. On the filtered data, calculate the correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap. What is the correlation and associated p value?
gdp.co2.cor <- cor.test(x=gapminder_1962$gdpPercap,y=gapminder_1962$`CO2 emissions (metric tons per capita)`)
# correlation val
gdp.co2.cor$estimate
##       cor 
## 0.9260817
# correlation p val
gdp.co2.cor$p.value
## [1] 1.128679e-46
  1. On the unfiltered data, answer “In what year is the correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?” Filter the dataset to that year for the next step…
gapminder %>% group_by(Year) %>% summarize(correlation=cor(`CO2 emissions (metric tons per capita)`,gdpPercap,use = "na.or.complete")) %>% subset(correlation == max(correlation))
## # A tibble: 1 x 2
##    Year correlation
##   <dbl>       <dbl>
## 1  1967       0.939
gapminder_1967 <- gapminder %>% filter(Year=="1967")
  1. Using plotly, create an interactive scatter plot comparing 'CO2 emissions (metric tons per capita)' and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.
plt <- gapminder_1967 %>% ggplot(aes(gdpPercap,`CO2 emissions (metric tons per capita)`, color = continent, size = pop)) + geom_point() + labs(title = "CO2 emissions by gdpPercap,continent, population size", color = "continent", size = "")
ggplotly(plt)

Now, without further guidance, use your R Data Science skills (and appropriate statistical tests) to answer the following:

  1. What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)'? (stats test needed)

The Energy usage between continent is significantly different (p = 2e-16 < 0.05).

gapminder_contE <- gapminder %>% select(continent, `Energy use (kg of oil equivalent per capita)`) %>% drop_na()

contE_aov <- aov(`Energy use (kg of oil equivalent per capita)` ~ continent, data = gapminder_contE)
summary(contE_aov)
##              Df    Sum Sq   Mean Sq F value Pr(>F)    
## continent     4 7.715e+08 192870621   51.46 <2e-16 ***
## Residuals   843 3.160e+09   3748033                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990? (stats test needed)

There is no significant difference (p = 0.1776 > 0.05) between Europe and Asia’s Import of goods and services (% of GDP) after 1990.

gapminder_1990 <- gapminder %>% filter(Year > "1990" & continent %in% c("Asia", "Europe")) %>% select(continent, `Imports of goods and services (% of GDP)`) %>% drop_na()

t.test(`Imports of goods and services (% of GDP)` ~ continent, data = gapminder_1990)
## 
##  Welch Two Sample t-test
## 
## data:  Imports of goods and services (% of GDP) by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.321099 12.433240
## sample estimates:
##   mean in group Asia mean in group Europe 
##             46.84531             41.78924
  1. What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
gapminder %>% group_by(`Country Name`) %>% summarize(mean_popdensity = mean(`Population density (people per sq. km of land area)`)) %>% drop_na() %>% subset(mean_popdensity == max(mean_popdensity))
## # A tibble: 1 x 2
##   `Country Name`   mean_popdensity
##   <chr>                      <dbl>
## 1 Macao SAR, China          14732.
  1. What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?
gapminder %>% select(`Country Name`, Year, `Life expectancy at birth, total (years)`) %>% group_by(`Country Name`) %>% pivot_wider(names_from = Year, values_from = `Life expectancy at birth, total (years)`) %>% select(`Country Name`, `1962`, `2007`) %>% mutate(change_lifeexp = `2007` - `1962`) %>% drop_na() %>% subset(change_lifeexp == max(change_lifeexp))
## # A tibble: 1 x 4
## # Groups:   Country Name [1]
##   `Country Name` `1962` `2007` change_lifeexp
##   <chr>           <dbl>  <dbl>          <dbl>
## 1 Maldives         38.5   75.4           36.9